Row

Overview

Data Load

Information on several of the top data scientists was stored in a CSV file and was tidied up using dplyr libraries.

SQL Creation and Insertion

DBI was used to create tables on the fly in an Azure Database for MySQL instance.

   skillid         skillname
1        1          Research
2        2        Statistics
3        3 Business Strategy
4        4     Data Analysis
5        5            Python
6        6   Microsoft Excel
7        7       Mathematica
8        8              Java
9        9                 R
10      10  Machine Learning
   personid                     title            name
1         1  Chief Decision Scientist Cassie Kozyrkov
2         2      Head of Data Science    Elena Grewal
3         3            Data Scientist    Lukas Hansen
4         4    Data Science Associate        Mike Zhu
5         5            Data Scientist William Roberts
6         6            Data Scientist      Daniel Law
7         7            Data Scientist       HiJae Kim
8         8       Lead Data Scientist     Scott Ogden
9         9 AI Engineer, Data Analyst Naoki Ishibashi
10       10            Data Scientist      Eric Morin
                 education degree      location          company
1          Duke University    PhD      New York           Google
2      Stanford University    PhD San Francisco           Airbnb
3  Northwestern University    MBA      New York            Aetna
4                      NYU     MS      New York     Oscar Health
5                      NYU     BA      New York Cityblock Health
6          Amherst College     BA San Francisco         Facebook
7          Carnegie Mellon     BS      New York         DailyPay
8   Stony Brook University     MS      New York      Healthfirst
9                     CUNY     MS         Tokyo         Softbank
10                    UCSD     BA         Tokyo          Rakuten

Web Scraping Information

Word Cloud

Data Scientists Common Locations

Conclusion

---
title: "Project 3 - Data Science Skills"
author: "David Apolinar, Anthony Muñoz, Christina Valore, Omar Pineda."
date: "3/24/2019"
output: 
  flexdashboard::flex_dashboard:
    orientation: rows
    source_code: embed
---

Row {.tabset .tabset-fade}
-------------------------------------

### Overview


### Data Load

Information on several of the top data scientists was stored in a CSV file and was tidied up using dplyr libraries.

```{r data load}
library(tidyr)
library(wordcloud)
library(tm)
library(SnowballC)
library(RColorBrewer)
library(magrittr)
library(DBI)
library(dplyr)
#  Data Files: raw data-scientist skills CSV hosted on GitHub.
ds <- read.csv("https://raw.githubusercontent.com/omarp120/DATA607Week8/master/DataScientists.csv", header=TRUE, stringsAsFactors = FALSE)

# Reshape the wide skill columns (Skill1..Skill50) into long form: one row
# per person/skill pair. pivot_longer() supersedes the retired gather().
tidyDS <- pivot_longer(ds, cols = Skill1:Skill50, names_to = "Number", values_to = "Skill")

# Drop rows whose skill slot was empty.
finalDS <- tidyDS[tidyDS$Skill != "", ]

# Skill frequency table, used later for the word cloud.
counts <- as.data.frame(table(finalDS$Skill))

```

### SQL Creation and Insertion

DBI was used to create tables on the fly in an Azure Database for MySQL instance.

```{r SQL dataload}
# Create Skill Table: one row per distinct skill plus a numeric surrogate key.
skilltable <- data.frame(SkillName = unique(finalDS$Skill),
                         stringsAsFactors = FALSE)
# seq_len() is safe even for a zero-row table (1:nrow would yield c(1, 0)).
skilltable$SkillID <- seq_len(nrow(skilltable))

# Run SQL statements to create tables.
# NOTE(review): database credentials are hardcoded in plain text here —
# move them to environment variables (Sys.getenv) before publishing.

con <- dbConnect(RMariaDB::MariaDB(), user='x-admin@cunyspsmysql.mysql.database.azure.com', password="7dWa0XUVHtlCJMS", dbname='datascientists' ,host='cunyspsmysql.mysql.database.azure.com')

# Drop the dependent link table first (it references person and skills).
# dbExecute() wraps dbSendStatement() + dbClearResult() in a single call.
dbExecute(con, "drop table if exists person_skills;")
dbExecute(con, "drop table if exists person;")
dbExecute(con, "drop table if exists skills;")

# One row per data scientist.
dbExecute(con, "CREATE TABLE person (
                personid int NOT NULL auto_increment primary key,
                title nchar(50),
                name nchar(50) NOT NULL,
                education nchar(50),
                degree nchar(50),
                location nchar(50),
                company nchar(50));")
# One row per distinct skill.
dbExecute(con, "CREATE TABLE skills (
                skillid int NOT NULL auto_increment primary key,
                skillname nchar(50) NOT NULL);")
# Many-to-many link between person and skills.
dbExecute(con, "CREATE TABLE person_skills (
personid int NOT NULL references person(personid),
                skillid int NOT NULL references skills(skillid),
                CONSTRAINT person_skill primary key(personid, skillid));")
dbDisconnect(con)
# Create SQL Connection
# NOTE(review): credentials are hardcoded in plain text; prefer Sys.getenv().
con <- dbConnect(RMariaDB::MariaDB(), user='x-admin@cunyspsmysql.mysql.database.azure.com', password="7dWa0XUVHtlCJMS", dbname='datascientists' ,host='cunyspsmysql.mysql.database.azure.com')

# Insert each skill. Binding values via `params` lets the driver escape
# them, avoiding the SQL-injection/quoting breakage risk of the original
# sprintf()-built statements (a skill name containing a quote would have
# produced invalid SQL). dbExecute() is the DBI idiom for write statements.
for (i in seq_len(nrow(skilltable))) {
  dbExecute(con,
            "insert into skills (skillname, skillid) values (?, ?);",
            params = list(skilltable$SkillName[i], skilltable$SkillID[i]))
}

# Preview the first ten inserted skills.
mysql_dataskills <- dbGetQuery(con, 'select * from skills
                               limit 10')
mysql_dataskills
dbDisconnect(con)

# Get Unique People to Insert
con <- dbConnect(RMariaDB::MariaDB(), user='x-admin@cunyspsmysql.mysql.database.azure.com', password="7dWa0XUVHtlCJMS", dbname='datascientists' ,host='cunyspsmysql.mysql.database.azure.com')

# One row per person: the long skill table repeats each person once per
# skill, so deduplicate on the person-level columns.
people_table <- finalDS %>% select(ID, Person, Title, School, HighestLevel, Location, Company) %>% unique()

# Insert each person. Parameterized values (?) are escaped by the driver,
# so names or companies containing quotes cannot break the SQL the way the
# original sprintf()-interpolated statements could.
for (i in seq_len(nrow(people_table))) {
  dbExecute(con,
            "insert into person
             (name, title, education, degree, location, company)
             values (?, ?, ?, ?, ?, ?);",
            params = list(people_table$Person[i],
                          people_table$Title[i],
                          people_table$School[i],
                          people_table$HighestLevel[i],
                          people_table$Location[i],
                          people_table$Company[i]))
}

# Preview the first ten inserted people.
mysql_datascientists <- dbGetQuery(con, 'select * from person
                                   limit 10;')
mysql_datascientists
dbDisconnect(con)



# Create Many to Many Relationship
linkdb <- tidyDS %>% select(ID, Skill)

# Look up the SkillID in the global `skilltable` for a single-row link
# record `n`. The original for-loop returned during its first iteration,
# so only the first row was ever consulted; the direct lookup below makes
# that explicit. Returns integer(0) when the skill is not in skilltable,
# which callers use to detect unmatched skills.
returnIndex <- function(n)
{
  skilltable$SkillID[skilltable$SkillName == n$Skill[1]]
}

# Remove duplicate rows: one (person, skill) pair each.
person_skill <- finalDS %>% select(ID, Person, Skill) %>% distinct()

#returnIndex(linkdb[478,])

# Create Link Table
con <- dbConnect(RMariaDB::MariaDB(), user='x-admin@cunyspsmysql.mysql.database.azure.com', password="7dWa0XUVHtlCJMS", dbname='datascientists' ,host='cunyspsmysql.mysql.database.azure.com')

# Insert one link row per (person, skill) pair. The SkillID lookup is
# hoisted so it runs once per row (the original recomputed it up to three
# times), and values are bound as parameters instead of being interpolated
# with sprintf().
for (i in seq_len(nrow(person_skill))) {
  skill_id <- returnIndex(person_skill[i, ])
  if (length(skill_id) != 0) {
    dbExecute(con,
              "insert into person_skills (personid, skillid) values (?, ?);",
              params = list(person_skill$ID[i], skill_id))
  } else {
    # Skill had no match in skilltable; skip the link row.
    print("Empty Skill Value, skipping link")
  }
}

dbDisconnect(con)
```
### Web Scraping Information



### Word Cloud

```{r data vis}
# Label the skill-frequency table, then draw the word cloud: more frequent
# skills render larger, placed in a fixed (non-random) order.
names(counts) <- c("Skill", "Freq")
wordcloud(words = counts$Skill,
          freq = counts$Freq,
          random.order = FALSE,
          scale = c(2, 0.10),
          colors = brewer.pal(8, "Dark2"))
```

### Data Scientists Common Locations

```{r goecode}

# Geocoding pass (disabled): code adapted from
# http://www.storybench.org/geocode-csv-addresses-r/
# Commented out because it requires a personal Google Maps API key; its
# output was written once to geocoded.csv, which the map chunk reads.
#library(ggmap)
#register_google(key = "xxx") #removed personal API key
# Initialize the data frame
#getOption("ggmap")
# Loop through the addresses to get the latitude and longitude of each address and add it to the
# origAddress data frame in new columns lat and lon
#for(i in 1:nrow(ds))
#{
  # Print("Working...")
#  result <- geocode(ds$Location[i], output = "latlon", source = "google")
#  ds$lon[i] <- as.numeric(result[1])
 # ds$lat[i] <- as.numeric(result[2])
#}
# Write a CSV file containing origAddress to the working directory
#write.csv(ds, "geocoded.csv", row.names=FALSE)
```

```{r map}
library(leaflet)
# Pre-geocoded city coordinates produced by the (disabled) geocoding pass
# and committed to GitHub.
cities <- read.csv("https://raw.githubusercontent.com/omarp120/DATA607Week8/master/geocoded.csv")
# One clustered marker per data scientist location. The argument is spelled
# out as `clusterOptions` — the original relied on R's partial argument
# matching ("clusterOption"), which is fragile.
cities %>%
  leaflet() %>%
  addTiles() %>%
  addMarkers(clusterOptions = markerClusterOptions())
```


### Conclusion